In [3]:
import scvelo as scv
scv.logging.print_version()
/Users/anaconda3/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.24.4
  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
Running scvelo 0.3.1 (python 3.9.12) on 2024-02-21 23:52.
ERROR: XMLRPC request failed [code: -32500]
RuntimeError: PyPI no longer supports 'pip search' (or XML-RPC search). Please use https://pypi.org/search (via a browser) instead. See https://warehouse.pypa.io/api-reference/xml-rpc.html#deprecated-methods for more information.
In [4]:
import scvelo as scv
import scanpy as sc
import cellrank as cr
import numpy as np
import pandas as pd
import anndata as ad
scv.settings.verbosity = 3
scv.settings.set_figure_params('scvelo', facecolor='white', dpi=100, frameon=False)
cr.settings.verbosity = 2
In [5]:
scv.settings.verbosity = 3  # show errors(0), warnings(1), info(2), hints(3)
scv.settings.presenter_view = True  # set max width size for presenter view
scv.set_figure_params('scvelo')  # for beautified visualization
In [6]:
adata = sc.read('/Users/usri/Desktop/YAO.7feb/feb10/adata3_feb12.scVelo.h5ad')
In [7]:
adata
Out[7]:
AnnData object with n_obs × n_vars = 1770 × 1999
    obs: 'Barcode', 'old.barcode', 'orig.ident.x', 'nCount_spliced', 'nFeature_spliced', 'nCount_unspliced', 'nFeature_unspliced', 'nCount_ambiguous', 'nFeature_ambiguous', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.8', 'seurat_clusters.x', 'SCT_snn_res.0.5', 'clusters', 'percent.mt.x', 'SCT_snn_res.0.3', 'SCT_snn_res.0.2', 'orig.ident.y', 'nCount_RNA', 'nFeature_RNA', 'percent.mt.y', 'RNA_snn_res.0.6', 'seurat_clusters.y', 'Clusters', 'RNA_snn_res.0.2', 'RNA_snn_res.0.3', 'EMTScore.76GS', 'Clusters.old', 'Clusters.number', 'Clusters.term', 'UMAP_1', 'UMAP_2', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'n_counts', 'velocity_self_transition', 'velocity_length', 'velocity_confidence', 'velocity_confidence_transition', 'root_cells', 'end_points', 'velocity_pseudotime'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable', 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'velocity_gamma', 'velocity_qreg_ratio', 'velocity_r2', 'velocity_genes', 'spearmans_score', 'velocity_score', 'fit_alpha', 'fit_beta', 'fit_gamma', 'fit_t_', 'fit_scaling', 'fit_std_u', 'fit_std_s', 'fit_likelihood', 'fit_u0', 'fit_s0', 'fit_pval_steady', 'fit_steady_u', 'fit_steady_s', 'fit_variance', 'fit_alignment_scaling', 'fit_diff_kinetics', 'fit_pval_kinetics'
    uns: 'Clusters.number_sizes', 'clusters_colors', 'neighbors', 'paga', 'rank_velocity_genes', 'recover_dynamics', 'velocity_graph', 'velocity_graph_neg', 'velocity_params'
    obsm: 'X_pca', 'X_umap', 'velocity_umap'
    varm: 'PCs', 'fit_pvals_kinetics', 'loss'
    layers: 'Ms', 'Mu', 'ambiguous', 'fit_t', 'fit_tau', 'fit_tau_', 'matrix', 'spliced', 'unspliced', 'variance_velocity', 'velocity'
    obsp: 'connectivities', 'distances'
In [8]:
# show proportions of spliced/unspliced abundances
scv.utils.show_proportions(adata)
adata
Abundance of ['unspliced', 'spliced', 'ambiguous']: [0.17 0.81 0.02]
Out[8]:
AnnData object with n_obs × n_vars = 1770 × 1999
    obs: 'Barcode', 'old.barcode', 'orig.ident.x', 'nCount_spliced', 'nFeature_spliced', 'nCount_unspliced', 'nFeature_unspliced', 'nCount_ambiguous', 'nFeature_ambiguous', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.8', 'seurat_clusters.x', 'SCT_snn_res.0.5', 'clusters', 'percent.mt.x', 'SCT_snn_res.0.3', 'SCT_snn_res.0.2', 'orig.ident.y', 'nCount_RNA', 'nFeature_RNA', 'percent.mt.y', 'RNA_snn_res.0.6', 'seurat_clusters.y', 'Clusters', 'RNA_snn_res.0.2', 'RNA_snn_res.0.3', 'EMTScore.76GS', 'Clusters.old', 'Clusters.number', 'Clusters.term', 'UMAP_1', 'UMAP_2', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'n_counts', 'velocity_self_transition', 'velocity_length', 'velocity_confidence', 'velocity_confidence_transition', 'root_cells', 'end_points', 'velocity_pseudotime'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable', 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'velocity_gamma', 'velocity_qreg_ratio', 'velocity_r2', 'velocity_genes', 'spearmans_score', 'velocity_score', 'fit_alpha', 'fit_beta', 'fit_gamma', 'fit_t_', 'fit_scaling', 'fit_std_u', 'fit_std_s', 'fit_likelihood', 'fit_u0', 'fit_s0', 'fit_pval_steady', 'fit_steady_u', 'fit_steady_s', 'fit_variance', 'fit_alignment_scaling', 'fit_diff_kinetics', 'fit_pval_kinetics'
    uns: 'Clusters.number_sizes', 'clusters_colors', 'neighbors', 'paga', 'rank_velocity_genes', 'recover_dynamics', 'velocity_graph', 'velocity_graph_neg', 'velocity_params'
    obsm: 'X_pca', 'X_umap', 'velocity_umap'
    varm: 'PCs', 'fit_pvals_kinetics', 'loss'
    layers: 'Ms', 'Mu', 'ambiguous', 'fit_t', 'fit_tau', 'fit_tau_', 'matrix', 'spliced', 'unspliced', 'variance_velocity', 'velocity'
    obsp: 'connectivities', 'distances'
In [9]:
#Preprocess the data
scv.pp.filter_genes(adata, min_shared_counts=3)
scv.pp.normalize_per_cell(adata)
scv.pp.filter_genes_dispersion(adata, n_top_genes=3000)
scv.pp.log1p(adata)
Filtered out 623 genes that are detected 3 counts (shared).
WARNING: Did not normalize X as it looks processed already. To enforce normalization, set `enforce=True`.
WARNING: Did not normalize spliced as it looks processed already. To enforce normalization, set `enforce=True`.
WARNING: Did not normalize unspliced as it looks processed already. To enforce normalization, set `enforce=True`.
Skip filtering by dispersion since number of variables are less than `n_top_genes`.
/var/folders/fj/btmxy73n74zgn38kbjwfv7zh0000gn/T/ipykernel_36991/2067228601.py:5: DeprecationWarning: `log1p` is deprecated since scVelo v0.3.0 and will be removed in a future version. Please use `log1p` from `scanpy.pp` instead.
  scv.pp.log1p(adata)
In [10]:
adata
Out[10]:
AnnData object with n_obs × n_vars = 1770 × 1376
    obs: 'Barcode', 'old.barcode', 'orig.ident.x', 'nCount_spliced', 'nFeature_spliced', 'nCount_unspliced', 'nFeature_unspliced', 'nCount_ambiguous', 'nFeature_ambiguous', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.8', 'seurat_clusters.x', 'SCT_snn_res.0.5', 'clusters', 'percent.mt.x', 'SCT_snn_res.0.3', 'SCT_snn_res.0.2', 'orig.ident.y', 'nCount_RNA', 'nFeature_RNA', 'percent.mt.y', 'RNA_snn_res.0.6', 'seurat_clusters.y', 'Clusters', 'RNA_snn_res.0.2', 'RNA_snn_res.0.3', 'EMTScore.76GS', 'Clusters.old', 'Clusters.number', 'Clusters.term', 'UMAP_1', 'UMAP_2', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'n_counts', 'velocity_self_transition', 'velocity_length', 'velocity_confidence', 'velocity_confidence_transition', 'root_cells', 'end_points', 'velocity_pseudotime'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable', 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'velocity_gamma', 'velocity_qreg_ratio', 'velocity_r2', 'velocity_genes', 'spearmans_score', 'velocity_score', 'fit_alpha', 'fit_beta', 'fit_gamma', 'fit_t_', 'fit_scaling', 'fit_std_u', 'fit_std_s', 'fit_likelihood', 'fit_u0', 'fit_s0', 'fit_pval_steady', 'fit_steady_u', 'fit_steady_s', 'fit_variance', 'fit_alignment_scaling', 'fit_diff_kinetics', 'fit_pval_kinetics'
    uns: 'Clusters.number_sizes', 'clusters_colors', 'neighbors', 'paga', 'rank_velocity_genes', 'recover_dynamics', 'velocity_graph', 'velocity_graph_neg', 'velocity_params', 'log1p'
    obsm: 'X_pca', 'X_umap', 'velocity_umap'
    varm: 'PCs', 'fit_pvals_kinetics', 'loss'
    layers: 'Ms', 'Mu', 'ambiguous', 'fit_t', 'fit_tau', 'fit_tau_', 'matrix', 'spliced', 'unspliced', 'variance_velocity', 'velocity'
    obsp: 'connectivities', 'distances'
In [11]:
scv.pp.filter_and_normalize(adata, min_shared_counts=3, n_top_genes=2000)
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
WARNING: Did not normalize X as it looks processed already. To enforce normalization, set `enforce=True`.
WARNING: Did not normalize spliced as it looks processed already. To enforce normalization, set `enforce=True`.
WARNING: Did not normalize unspliced as it looks processed already. To enforce normalization, set `enforce=True`.
Skip filtering by dispersion since number of variables are less than `n_top_genes`.
WARNING: adata.X seems to be already log-transformed.
Logarithmized X.
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
/Users/anaconda3/lib/python3.9/site-packages/scvelo/preprocessing/utils.py:705: DeprecationWarning: `log1p` is deprecated since scVelo v0.3.0 and will be removed in a future version. Please use `log1p` from `scanpy.pp` instead.
  log1p(adata)
In [12]:
#Compute velocity and velocity graph
#The gene-specific velocities are obtained by fitting a ratio between precursor (unspliced) and mature (spliced) mRNA abundances that well explains the steady states (constant transcriptional state) and then computing how the observed abundances deviate from what is expected in steady state.
scv.tl.velocity(adata)
computing velocities
    finished (0:00:00) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
In [13]:
scv.tl.velocity_graph(adata)
computing velocity graph (using 1/8 cores)
  0%|          | 0/1770 [00:00<?, ?cells/s]
    finished (0:00:02) --> added 
    'velocity_graph', sparse matrix with cosine correlations (adata.uns)
In [14]:
adata
Out[14]:
AnnData object with n_obs × n_vars = 1770 × 1376
    obs: 'Barcode', 'old.barcode', 'orig.ident.x', 'nCount_spliced', 'nFeature_spliced', 'nCount_unspliced', 'nFeature_unspliced', 'nCount_ambiguous', 'nFeature_ambiguous', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.8', 'seurat_clusters.x', 'SCT_snn_res.0.5', 'clusters', 'percent.mt.x', 'SCT_snn_res.0.3', 'SCT_snn_res.0.2', 'orig.ident.y', 'nCount_RNA', 'nFeature_RNA', 'percent.mt.y', 'RNA_snn_res.0.6', 'seurat_clusters.y', 'Clusters', 'RNA_snn_res.0.2', 'RNA_snn_res.0.3', 'EMTScore.76GS', 'Clusters.old', 'Clusters.number', 'Clusters.term', 'UMAP_1', 'UMAP_2', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'n_counts', 'velocity_self_transition', 'velocity_length', 'velocity_confidence', 'velocity_confidence_transition', 'root_cells', 'end_points', 'velocity_pseudotime'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable', 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'velocity_gamma', 'velocity_qreg_ratio', 'velocity_r2', 'velocity_genes', 'spearmans_score', 'velocity_score', 'fit_alpha', 'fit_beta', 'fit_gamma', 'fit_t_', 'fit_scaling', 'fit_std_u', 'fit_std_s', 'fit_likelihood', 'fit_u0', 'fit_s0', 'fit_pval_steady', 'fit_steady_u', 'fit_steady_s', 'fit_variance', 'fit_alignment_scaling', 'fit_diff_kinetics', 'fit_pval_kinetics'
    uns: 'Clusters.number_sizes', 'clusters_colors', 'neighbors', 'paga', 'rank_velocity_genes', 'recover_dynamics', 'velocity_graph', 'velocity_graph_neg', 'velocity_params', 'log1p'
    obsm: 'X_pca', 'X_umap', 'velocity_umap'
    varm: 'PCs', 'fit_pvals_kinetics', 'loss'
    layers: 'Ms', 'Mu', 'ambiguous', 'fit_t', 'fit_tau', 'fit_tau_', 'matrix', 'spliced', 'unspliced', 'variance_velocity', 'velocity'
    obsp: 'connectivities', 'distances'
In [15]:
#Plot results
#Finally, the velocities are projected onto any embedding specified in basis and visualized in one of three available ways: on single cell level, on grid level, or as streamplot as shown here.
scv.pl.velocity_embedding_stream(adata, basis='umap', color=['Clusters.term', 'Clusters.number'])
computing velocity embedding
    finished (0:00:00) --> added
    'velocity_umap', embedded velocity vectors (adata.obsm)
In [16]:
#Clusters
scv.pl.velocity_embedding_stream(adata, basis='umap', color=['Clusters', 'root_cells'], arrow_size=1, dpi=100)
In [17]:
scv.pl.velocity_embedding(adata, basis='umap', arrow_length=3, arrow_size=2, dpi=200,  color='Clusters.number')
In [18]:
#scv.pl.velocity_embedding_grid(adata, color='KRT17', layer=['velocity', 'spliced'], arrow_size=3)
scv.pl.velocity_embedding_grid(adata, color='KRT17', layer=['velocity'], arrow_size=2, arrow_length=3, dpi=150)
In [19]:
scv.pl.velocity_embedding_grid(adata, color='KRT17', layer=['spliced'], arrow_size=2, arrow_length=3, dpi=150)
In [20]:
scv.pl.velocity_embedding_grid(adata, color='KRT20', layer=['velocity'], arrow_size=2, arrow_length=3, dpi=150, )
In [21]:
scv.pl.velocity_embedding_grid(adata, color='KRT20', layer=['spliced'], arrow_size=2, arrow_length=3, dpi=150)
In [22]:
scv.tl.rank_velocity_genes(adata, groupby='Clusters')
#scv.DataFrame(adata.uns['rank_velocity_genes']['names']).head()
ranking velocity genes
    finished (0:00:00) --> added 
    'rank_velocity_genes', sorted scores by group ids (adata.uns) 
    'spearmans_score', spearmans correlation scores (adata.var)
In [23]:
#adata.uns
In [24]:
var_names = ['KRT17']
scv.pl.velocity(adata, var_names=var_names, colorbar=True, ncols=2)
/Users/anaconda3/lib/python3.9/site-packages/scvelo/plotting/scatter.py:655: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  smp = ax.scatter(
In [25]:
var_names = ['KRT19']
scv.pl.velocity(adata, var_names=var_names, colorbar=True, ncols=2)
/Users/anaconda3/lib/python3.9/site-packages/scvelo/plotting/scatter.py:655: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  smp = ax.scatter(
In [26]:
var_names = ['KRT20']
scv.pl.velocity(adata, var_names=var_names, colorbar=True, ncols=2)
/Users/anaconda3/lib/python3.9/site-packages/scvelo/plotting/scatter.py:655: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  smp = ax.scatter(
In [27]:
scv.pl.velocity_graph(adata, color='Clusters.number' )
In [28]:
scv.pl.velocity_graph(adata, color='Clusters.term' )
In [29]:
#velocity_pseudotime
scv.pl.velocity_graph(adata, color='velocity_pseudotime' )
In [30]:
scv.tl.terminal_states(adata)
scv.pl.velocity_embedding_grid(adata, basis='umap', color=['root_cells', 'end_points'], arrow_size=3, arrow_length=3, dpi=200 )
computing terminal states
    identified 1 region of root cells and 1 region of end points .
    finished (0:00:00) --> added
    'root_cells', root cells of Markov diffusion process (adata.obs)
    'end_points', end points of Markov diffusion process (adata.obs)
In [31]:
scv.pl.velocity(adata, basis='umap', var_names=["KRT17", "KRT18", "KRT19"])
/Users/anaconda3/lib/python3.9/site-packages/scvelo/plotting/scatter.py:655: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  smp = ax.scatter(
/Users/anaconda3/lib/python3.9/site-packages/scvelo/plotting/scatter.py:655: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  smp = ax.scatter(
/Users/anaconda3/lib/python3.9/site-packages/scvelo/plotting/scatter.py:655: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  smp = ax.scatter(
In [32]:
adata.write('/Users/usri/Desktop/YAO.7feb/feb10/adata.RNAvelocity.scapy.20feb.h5ad')
In [33]:
scv.pl.scatter(adata, color=['root_cells', 'end_points'])
In [34]:
#velocity_pseudotime
scv.pl.scatter(adata, color=['velocity_pseudotime'])
In [35]:
scv.tl.latent_time(adata)
scv.pl.scatter(adata,color='latent_time', color_map='gnuplot', colorbar=True)
computing latent time using root_cells as prior
    finished (0:00:00) --> added 
    'latent_time', shared time (adata.obs)
In [36]:
top_genes= adata.var['fit_likelihood'].sort_values(ascending=False).index
scv.pl.scatter(adata, basis=top_genes[:10], ncols=5, frameon=False)
In [37]:
scv.pl.scatter(adata, basis="KRT19", ncols=1, frameon=False)
In [38]:
scv.pl.scatter(adata, basis="KRT17", ncols=1, frameon=False)
In [39]:
scv.pl.scatter(adata, color=['root_cells', 'EMTScore.76GS'])
In [40]:
sc.pl.umap(adata, color="KRT17")
In [41]:
adata.var
Out[41]:
vst.mean vst.variance vst.variance.expected vst.variance.standardized vst.variable Accession Chromosome End Start Strand ... fit_likelihood fit_u0 fit_s0 fit_pval_steady fit_steady_u fit_steady_s fit_variance fit_alignment_scaling fit_diff_kinetics fit_pval_kinetics
HES4 2.011864 6.088610 4.699831 1.295495 1 ENSG00000188290 1 1000172 998962 - ... NaN NaN NaN NaN NaN NaN NaN NaN nan NaN
ISG15 3.800565 29.049519 12.996684 2.235148 1 ENSG00000187608 1 1014540 1001138 + ... NaN NaN NaN NaN NaN NaN NaN NaN nan NaN
AL592464.3 0.010169 0.129913 0.011446 1.253532 1 ENSG00000287396 1 2817767 2814056 + ... 0.001567 0.0 0.0 0.255973 0.058847 0.414124 2.964007 1.012848 nan NaN
ERRFI1 0.723729 1.622331 1.115570 1.454262 1 ENSG00000116285 1 8026309 8004404 - ... NaN NaN NaN NaN NaN NaN NaN NaN nan NaN
GPR157 0.502825 0.816555 0.700716 1.165315 1 ENSG00000180758 1 9129102 9100305 - ... 0.177171 0.0 0.0 0.479218 0.217318 1.264409 1.629003 2.039069 nan NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
L1CAM 0.028814 0.042697 0.033229 1.284924 1 ENSG00000198910 X 153886173 153861514 - ... 0.111293 0.0 0.0 0.278802 0.046617 0.183412 1.368152 1.853301 nan NaN
NAA10 4.915819 23.293644 20.054323 1.161527 1 ENSG00000102030 X 153935080 153929225 - ... NaN NaN NaN NaN NaN NaN NaN NaN nan NaN
FLNA 1.048588 2.958633 1.827654 1.618814 1 ENSG00000196924 X 154374638 154348524 - ... NaN NaN NaN NaN NaN NaN NaN NaN nan NaN
LAGE3 2.146328 5.909045 5.190976 1.138330 1 ENSG00000196976 X 154479257 154477769 - ... NaN NaN NaN NaN NaN NaN NaN NaN nan NaN
CTAG2 0.145198 0.535717 0.175973 3.044310 1 ENSG00000126890 X 154653579 154651972 - ... NaN NaN NaN NaN NaN NaN NaN NaN nan NaN

1376 rows × 33 columns

In [42]:
# find gene markers for each of the detected leiden clusters
# Perform the ranking
sc.tl.rank_genes_groups(adata, 'Clusters.term', method='wilcoxon')
# The result is stored in adata.uns['rank_genes_groups']
result = adata.uns['rank_genes_groups']

# Initialize a list to hold dataframes for each cluster
df_list = []
# Iterate over clusters
for group in result['names'].dtype.names:
    # Create a DataFrame for the current cluster
    df = pd.DataFrame({    
        'Genes': result['names'][group],
        'Scores': result['scores'][group],
        'Logfoldchanges': result['logfoldchanges'][group],
        'pvals': result['pvals'][group],
        'pvals_adj': result['pvals_adj'][group],
    })
    
    # Add a column for the cluster
    df['Cluster'] = group

    # Append the DataFrame to the list
    df_list.append(df)

# Concatenate all dataframes
df_all = pd.concat(df_list, ignore_index=True)

# Write DataFrame to a csv file
df_all.to_csv('/Users/usri/Desktop/YAO.7feb/feb10/scanpy_marker_genes.csv', index=False)
df_all
Out[42]:
Genes Scores Logfoldchanges pvals pvals_adj Cluster
0 APCDD1 12.732535 7.994702 3.899681e-37 1.427322e-32 Basal-like
1 PCP4 12.551043 7.022396 3.923096e-36 7.179462e-32 Basal-like
2 OXR1 12.412327 3.096099 2.240448e-35 2.733422e-31 Basal-like
3 SLC7A8 12.032034 3.951415 2.411465e-33 2.206551e-29 Basal-like
4 LEF1 11.876751 6.676805 1.563254e-32 1.144333e-28 Basal-like
... ... ... ... ... ... ...
183000 LDHA -5.866008 0.006572 4.464114e-09 3.598922e-08 Stem.TA.2
183001 MALAT1 -7.237833 -0.441098 4.559107e-13 5.164589e-12 Stem.TA.2
183002 RPL13A -7.252900 -0.107191 4.079416e-13 4.645634e-12 Stem.TA.2
183003 KRT19 -8.111961 -0.298748 4.980901e-16 7.197234e-15 Stem.TA.2
183004 FABP1 -8.755919 -0.765860 2.024473e-18 3.577872e-17 Stem.TA.2

183005 rows × 6 columns

In [43]:
df_all_f = df_all[(df_all.pvals_adj < 0.05)&(abs(df_all.Logfoldchanges)> 0)]
df_all_f
Out[43]:
Genes Scores Logfoldchanges pvals pvals_adj Cluster
0 APCDD1 12.732535 7.994702 3.899681e-37 1.427322e-32 Basal-like
1 PCP4 12.551043 7.022396 3.923096e-36 7.179462e-32 Basal-like
2 OXR1 12.412327 3.096099 2.240448e-35 2.733422e-31 Basal-like
3 SLC7A8 12.032034 3.951415 2.411465e-33 2.206551e-29 Basal-like
4 LEF1 11.876751 6.676805 1.563254e-32 1.144333e-28 Basal-like
... ... ... ... ... ... ...
183000 LDHA -5.866008 0.006572 4.464114e-09 3.598922e-08 Stem.TA.2
183001 MALAT1 -7.237833 -0.441098 4.559107e-13 5.164589e-12 Stem.TA.2
183002 RPL13A -7.252900 -0.107191 4.079416e-13 4.645634e-12 Stem.TA.2
183003 KRT19 -8.111961 -0.298748 4.980901e-16 7.197234e-15 Stem.TA.2
183004 FABP1 -8.755919 -0.765860 2.024473e-18 3.577872e-17 Stem.TA.2

30652 rows × 6 columns

In [44]:
df_all_f.to_csv('/Users/usri/Desktop/YAO.7feb/feb10/scvelo__markers.Clusters.term.csv')
In [45]:
scv.pl.velocity_embedding_stream(adata, basis='umap', color = 'Clusters.term', legend_loc='on data')
In [46]:
scv.tl.velocity_confidence(adata)
keys = 'velocity_length', 'velocity_confidence'
scv.pl.scatter(adata, c = keys, cmap='coolwarm', perc = [5,95])
--> added 'velocity_length' (adata.obs)
--> added 'velocity_confidence' (adata.obs)
In [47]:
adata.obs
Out[47]:
Barcode old.barcode orig.ident.x nCount_spliced nFeature_spliced nCount_unspliced nFeature_unspliced nCount_ambiguous nFeature_ambiguous nCount_SCT ... initial_size n_counts velocity_self_transition velocity_length velocity_confidence velocity_confidence_transition root_cells end_points velocity_pseudotime latent_time
possorted_genome_bam_K9RX7:AAACCCAGTCCGAAGAx AAACCCAGTCCGAAGA-1 possorted_genome_bam_K9RX7:AAACCCAGTCCGAAGAx possorted 22913.0 4969.0 5090.0 2782.0 3272.0 1358.0 16128.0 ... 22913.0 NaN 0.102989 9.460000 0.921991 0.133375 0.017141 0.169256 0.130356 0.537701
possorted_genome_bam_K9RX7:AAACCCATCACTGTCCx AAACCCATCACTGTCC-1 possorted_genome_bam_K9RX7:AAACCCATCACTGTCCx possorted 27440.0 5237.0 5162.0 2843.0 3534.0 1437.0 16040.0 ... 27440.0 NaN 0.232820 10.370000 0.928700 0.118324 0.063491 0.194884 0.117105 0.471891
possorted_genome_bam_K9RX7:AAACGCTAGGCTGGATx AAACGCTAGGCTGGAT-1 possorted_genome_bam_K9RX7:AAACGCTAGGCTGGATx possorted 15336.0 4196.0 4478.0 2170.0 2441.0 964.0 15307.0 ... 15336.0 NaN 0.232637 24.530001 0.879951 0.023200 0.582353 0.061875 0.201713 0.498656
possorted_genome_bam_K9RX7:AAACGCTCATGAGTAAx AAACGCTCATGAGTAA-1 possorted_genome_bam_K9RX7:AAACGCTCATGAGTAAx possorted 23971.0 4398.0 3746.0 1987.0 2856.0 1079.0 15695.0 ... 23971.0 NaN 0.162755 8.260000 0.928741 0.092724 0.213005 0.039025 0.230673 0.213358
possorted_genome_bam_K9RX7:AAAGAACCAACTGAAAx AAAGAACCAACTGAAA-1 possorted_genome_bam_K9RX7:AAAGAACCAACTGAAAx possorted 7347.0 2784.0 2622.0 1656.0 1071.0 633.0 14497.0 ... 7347.0 NaN 0.322005 13.040000 0.907942 -0.043836 0.141615 0.071429 0.083507 0.360468
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
possorted_genome_bam_K9RX7:TTTGATCTCAATCGGTx TTTGATCTCAATCGGT-1 possorted_genome_bam_K9RX7:TTTGATCTCAATCGGTx possorted 28138.0 5320.0 6734.0 3200.0 3665.0 1412.0 15979.0 ... 28138.0 NaN 0.175585 8.380000 0.958243 0.046899 0.023345 0.827413 0.135706 0.515914
possorted_genome_bam_K9RX7:TTTGGAGCAGGCACTCx TTTGGAGCAGGCACTC-1 possorted_genome_bam_K9RX7:TTTGGAGCAGGCACTCx possorted 25570.0 5454.0 7959.0 3454.0 3740.0 1506.0 16175.0 ... 25570.0 NaN 0.310259 23.280001 0.875293 -0.116175 0.110187 0.021070 0.107341 0.499428
possorted_genome_bam_K9RX7:TTTGGAGGTAACATGAx TTTGGAGGTAACATGA-1 possorted_genome_bam_K9RX7:TTTGGAGGTAACATGAx possorted 23972.0 5173.0 3411.0 2096.0 3153.0 1283.0 16057.0 ... 23972.0 NaN 0.260342 12.710000 0.971284 -0.128660 0.027016 0.354314 0.139483 0.536332
possorted_genome_bam_K9RX7:TTTGGAGTCGTGGGTCx TTTGGAGTCGTGGGTC-1 possorted_genome_bam_K9RX7:TTTGGAGTCGTGGGTCx possorted 4873.0 1973.0 1029.0 784.0 613.0 394.0 14611.0 ... 4873.0 NaN 0.087861 10.470000 0.928094 0.121362 0.146675 0.144499 0.330151 0.234698
possorted_genome_bam_K9RX7:TTTGGAGTCTGCGGACx TTTGGAGTCTGCGGAC-1 possorted_genome_bam_K9RX7:TTTGGAGTCTGCGGACx possorted 53582.0 6531.0 9854.0 3972.0 6494.0 1875.0 15283.0 ... 53582.0 NaN 0.061931 9.140000 0.940884 0.223247 0.036231 0.070889 0.095348 0.276820

1770 rows × 45 columns

In [48]:
#df = adata.obs.groupby('Clusters.term')[keys].mean().T
#df.style.background_gradient(cmap='coolwarm', axis=1)
In [49]:
scv.pl.paga(adata, basis='umap', size=50, alpha=.1, min_edge_width=2, node_size_scale=1.5, color = 'Clusters.term')
WARNING: Invalid color key. Using grey instead.
/Users/anaconda3/lib/python3.9/site-packages/networkx/convert.py:157: DeprecationWarning: 

The scipy.sparse array containers will be used instead of matrices
in Networkx 3.0. Use `from_scipy_sparse_array` instead.
  return nx.from_scipy_sparse_matrix(data, create_using=create_using)
/Users/anaconda3/lib/python3.9/site-packages/networkx/convert.py:157: DeprecationWarning: 

The scipy.sparse array containers will be used instead of matrices
in Networkx 3.0. Use `from_scipy_sparse_array` instead.
  return nx.from_scipy_sparse_matrix(data, create_using=create_using)
In [50]:
op_genes = adata.var['velocity_qreg_ratio'].sort_values(ascending=False).index[:600]
scv.pl.heatmap(adata, var_names=top_genes, sortby='latent_time', col_color='Clusters.term', n_convolve=100)
/Users/anaconda3/lib/python3.9/site-packages/seaborn/matrix.py:213: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.
  self.cmap = mpl.cm.get_cmap(cmap)
In [51]:
var_names = ['KRT17', 'KRT19', 'KRT20']
scv.pl.scatter(adata, var_names, color = ['Clusters.term'], frameon=False)
#scv.pl.scatter(adata, x='latent_time', color = ['Clusters.term'],y=var_names, frameon=False)
In [52]:
#Cluster-specific top-likelihood genes
#Moreover, partial gene likelihoods can be computed for a each cluster of cells to enable cluster-specific identification of potential drivers.

scv.tl.rank_dynamical_genes(adata, groupby='Clusters.term')
df = scv.get_df(adata, 'rank_dynamical_genes/names')
df.head(50)
ranking genes by cluster-specific likelihoods
    finished (0:00:00) --> added 
    'rank_dynamical_genes', sorted scores by group ids (adata.uns)
Out[52]:
Basal-like Entero-like Sec-like Stem.TA.1 Stem.TA.2
0 RBP1 PAPSS2 LIMA1 GPA33 KCTD16
1 KCNJ8 DACH1 ATAD2 KIT KIF4A
2 FAM171B LPIN2 MYH9 LGALS4 GPA33
3 ANKRD34B NMU PRKDC KCTD16 HNMT
4 ATAD2 KIT GPA33 IQGAP2 PGK1
5 MT4 RND3 PLPP1 LPIN2 LGALS4
6 TMPRSS11E PRKDC LDHA HMGCS2 ATAD2
7 HMCN1 TDP2 G2E3 ATAD2 LPIN2
8 TCEA2 PLPP1 TPD52L1 ZNF704 FOS
9 TRPC3 P4HA1 CALB1 TPX2 ZNF704
10 CPVL C15orf48 TSPAN8 PGK1 TPX2
11 ISM1 FER1L6 INSIG1 DBF4 RND3
12 CDK19 KCTD16 CENPF PAPSS2 CENPF
13 HAS2 HMGCS2 SLC4A7 CDC25A CLSPN
14 AL355916.1 CA9 TUBA1C TCOF1 PARM1
15 IDI1 TSPAN5 PIF1 P4HA1 TMEM45B
16 PCP4 AIG1 HMGCS1 FOS IQGAP2
17 SPTSSB RHBDL2 TPX2 CDCA4 BLM
18 LINC01116 NXPE4 NCAPG2 TSPAN8 CDCA4
19 CCDC80 CD276 BMP4 AKAP7 CALB1
20 OSBPL6 BCAS1 SKA3 CLSPN NMU
21 KIT MAP4K4 RRM1 NCAPH FANCI
22 APCDD1 PFKFB4 SLC12A2 TMEM45B DBF4
23 SERPINI1 SRPX2 XRCC2 CALB1 FAM83D
24 ZNF618 PARM1 TSC22D1 RND3 TSPAN8
25 GNAI1 TSC22D1 KRT18 TSC22D1 LIMA1
26 TRDC FAM222A EFNB2 KIF4A TCOF1
27 CLCA2 LGALS4 ARL6IP1 CENPF TSC22D1
28 LEF1 FOS TROAP HNMT EXO1
29 FGF20 HNMT ATP2B1 KIF11 KIF11
30 ZNF704 YPEL2 MELK CDK19 IDI1
31 HEPHL1 SLC12A2 DBF4 NMU FANCB
32 MMP2 SORL1 TRIM31 NCAPG2 MELK
33 G2E3 CDK19 CEP152 PLPP1 NCAPH
34 SLC1A3 IDI1 SULT1B1 PARM1 HMGCS2
35 TSPAN5 HHLA2 KIFC1 FAM83D DSCC1
36 SRPX ART3 HJURP KIF2C PLPP1
37 CALB1 BNIP3L KIF20B FANCB ALCAM
38 YPEL5 YPEL5 ARHGEF39 IDI1 TUBA1C
39 PRRX1 CEMIP MSMO1 TDP2 G2E3
40 CHST11 NR1H4 MORC4 TUBA1C E2F8
41 FAM178B TNFSF15 ATF3 WDHD1 EFNB2
42 LRRN3 LGI4 C10orf99 SGO2 RRM1
43 GATA3 TRIB1 PDCD4 MAP4K4 AIG1
44 RAB7B HAVCR1 ESCO2 CASP8AP2 CDCA2
45 MAN1A1 TPD52L1 NDC80 COLCA2 PAPSS2
46 FHOD3 LIPH KIF14 LIMA1 MAOA
47 AC005162.3 BHLHE40 MSH6 SYTL2 MCM10
48 P4HA1 MYH9 KRT19 PRKDC CCNE2
49 MAP2 ARHGAP29 XAF1 ALCAM CD276
In [53]:
kwargs = dict(frameon=False, size=10, linewidth=1.5,
              add_outline='Basal-like, Entero-like, Sec-like, Stem.TA.1, Stem.TA.2')

scv.pl.scatter(adata, df['Basal-like'][:8], ylabel='Basal-like', **kwargs)
scv.pl.scatter(adata, df['Entero-like'][:8], ylabel='Entero-like', **kwargs)
scv.pl.scatter(adata, df['Sec-like'][:8], ylabel='Sec-like', **kwargs)
scv.pl.scatter(adata, df['Stem.TA.1'][:8], ylabel='Stem.TA.1', **kwargs)
scv.pl.scatter(adata, df['Stem.TA.2'][:8], ylabel='Stem.TA.2', **kwargs)
In [54]:
#Velocities in cycling progenitors¶
#The cell cycle detected by RNA velocity, is biologically affirmed by cell cycle scores (standardized scores of mean expression levels of phase marker genes).
scv.tl.score_genes_cell_cycle(adata)
scv.pl.scatter(adata, color_gradients=['S_score', 'G2M_score'], smooth=True, perc=[5, 95])
calculating cell cycle phase
-->     'S_score' and 'G2M_score', scores of cell cycle phases (adata.obs)
In [55]:
s_genes, g2m_genes = scv.utils.get_phase_marker_genes(adata)
s_genes = scv.get_df(adata[:, s_genes], 'spearmans_score', sort_values=True).index
g2m_genes = scv.get_df(adata[:, g2m_genes], 'spearmans_score', sort_values=True).index

kwargs = dict(frameon=False, ylabel='cell cycle genes')
scv.pl.scatter(adata, list(s_genes[:2]) + list(g2m_genes[:3]), **kwargs)
In [56]:
#Top-likelihood genes:: Identified driver genes with pronounced dynamic behavior
#Driver genes display pronounced dynamic behavior and are systematically detected via their characterization by high likelihoods in the dynamic model
top_genes = adata.var['fit_likelihood'].sort_values(ascending=False).index
scv.pl.scatter(adata, basis=top_genes[:20], ncols=5, frameon=False)
In [57]:
top_genes = adata.var['fit_likelihood'].sort_values(ascending=False).index[:300]
scv.pl.heatmap(adata, var_names=top_genes, sortby='latent_time', col_color='clusters', n_convolve=100)
/Users/anaconda3/lib/python3.9/site-packages/seaborn/matrix.py:213: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.
  self.cmap = mpl.cm.get_cmap(cmap)
In [58]:
var_names = ['KRT17', 'KRT20', 'KRT19']
scv.pl.scatter(adata, var_names, frameon=False)
scv.pl.scatter(adata, x='latent_time', y=var_names, frameon=False)
In [59]:
#Testing top-likelihood genes¶
#Screening through the top-likelihood genes, we find some gene-wise dynamics that display multiple kinetic regimes.
In [ ]:
scv.tl.recover_dynamics(adata)
In [ ]:
top_genes = adata.var['fit_likelihood'].sort_values(ascending=False).index[:100]
scv.tl.differential_kinetic_test(adata, var_names=top_genes, groupby='Clusters.term')
In [ ]:
scv.tl.velocity_pseudotime(adata)
scv.pl.scatter(adata, color='velocity_pseudotime', cmap='gnuplot')
In [ ]:
scv.tl.latent_time(adata)
scv.pl.scatter(adata, color='latent_time', color_map='gnuplot', size=80)
In [ ]:
top_genes = adata.var['fit_likelihood'].sort_values(ascending=False).index[:300]
scv.pl.heatmap(adata, var_names=top_genes, sortby='latent_time', col_color='clusters', n_convolve=100)
In [ ]:
scv.pl.velocity_embedding_stream(adata, basis='umap', color='Clusters')
In [66]:
scv.pl.paga(adata, basis='umap', size=50, alpha=.1,
            min_edge_width=2, node_size_scale=1.5)
WARNING: Invalid color key. Using grey instead.
/Users/anaconda3/lib/python3.9/site-packages/networkx/convert.py:157: DeprecationWarning: 

The scipy.sparse array containers will be used instead of matrices
in Networkx 3.0. Use `from_scipy_sparse_array` instead.
  return nx.from_scipy_sparse_matrix(data, create_using=create_using)
In [67]:
scv.tl.velocity(adata, mode='steady_state')
scv.tl.velocity(adata, mode='stochastic')
scv.tl.velocity(adata, mode='dynamical')
computing velocities
    finished (0:00:00) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
computing velocities
    finished (0:00:00) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
computing velocities
    finished (0:00:01) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
In [69]:
scv.pl.scatter(adata, basis=[top_genes, ])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [69], in <cell line: 1>()
----> 1 scv.pl.scatter(adata, basis=[top_genes, ])

File /Users/anaconda3/lib/python3.9/site-packages/scvelo/plotting/scatter.py:204, in scatter(adata, basis, x, y, vkey, color, use_raw, layer, color_map, colorbar, palette, size, alpha, linewidth, linecolor, perc, groups, sort_order, components, projection, legend_loc, legend_loc_lines, legend_fontsize, legend_fontweight, legend_fontoutline, legend_align_text, xlabel, ylabel, title, fontsize, figsize, xlim, ylim, add_density, add_assignments, add_linfit, add_polyfit, add_rug, add_text, add_text_pos, add_margin, add_outline, outline_width, outline_color, n_convolve, smooth, normalize_data, rescale_color, color_gradients, dpi, frameon, zorder, ncols, nrows, wspace, hspace, show, save, ax, **kwargs)
    202     multikey = unique(multikey)  # take unique set if no more than one multikey
    203 if len(multikey) > 20:
--> 204     raise ValueError("Please restrict the passed list to max 20 elements.")
    205 if ax is not None:
    206     logg.warn("Cannot specify `ax` when plotting multiple panels.")

ValueError: Please restrict the passed list to max 20 elements.
In [70]:
scv.pl.scatter(adata, basis="KRT17")
In [71]:
scv.tl.velocity(adata, mode='stochastic')
@scv.tl.velocity_graph(adata)
#scv.tl.umap(adata)
#scv.pl.velocity_embedding_stream(adata, basis='umap', color=['clusters'])
  Input In [71]
    #scv.pl.velocity_embedding_stream(adata, basis='umap', color=['clusters'])
                                                                              ^
SyntaxError: unexpected EOF while parsing
In [72]:
#top likelihood genes
print(adata.var['velocity_genes'].sum(), adata.n_vars)
top_genes = adata.var_names[adata.var.fit_likelihood.argsort()[::-1]]
scv.pl.scatter(adata, basis=top_genes[:10], ncols=5)
438 1376
In [ ]: